<!DOCTYPE html>












  




<html class="theme-next gemini use-motion" lang="zh-CN">
<head>
  <!-- hexo-inject:begin --><!-- hexo-inject:end --><meta charset="UTF-8"/>
<meta name="google-site-verification" content="o9IkI77-fxkhBZW-n0ww9JALMCqdDbeTgdcXO_Bw4Zc" />
<meta name="baidu-site-verification" content="3frqY9KiVO" />
<meta http-equiv="X-UA-Compatible" content="IE=edge" />
<meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=2"/>
<meta name="theme-color" content="#222">



  
  
  <link rel="stylesheet" href="/lib/needsharebutton/needsharebutton.css">










<meta http-equiv="Cache-Control" content="no-transform" />
<meta http-equiv="Cache-Control" content="no-siteapp" />



















  
  
  
  

  
    
    
  

  
    
      
    

    
  

  

  
    
      
    

    
  

  
    
      
    

    
  

  
    
    
    <!-- Web fonts: explicit https, "+" for spaces in family names, and "&amp;" for the query separator. -->
    <link href="https://fonts.googleapis.com/css?family=Monda:300,300italic,400,400italic,700,700italic|Roboto+Slab:300,300italic,400,400italic,700,700italic|Lobster+Two:300,300italic,400,400italic,700,700italic|PT+Mono:300,300italic,400,400italic,700,700italic&amp;subset=latin,latin-ext" rel="stylesheet" type="text/css">
  






<link href="/lib/font-awesome/css/font-awesome.min.css?v=4.6.2" rel="stylesheet" type="text/css" />

<link href="/css/main.css?v=6.4.1" rel="stylesheet" type="text/css" />


  <link rel="apple-touch-icon" sizes="180x180" href="/images/logo.png?v=6.4.1">


  <link rel="icon" type="image/png" sizes="32x32" href="/images/logo.png?v=6.4.1">


  <link rel="icon" type="image/png" sizes="16x16" href="/images/logo.png?v=6.4.1">


  <link rel="mask-icon" href="/images/logo.svg?v=6.4.1" color="#222">









<script type="text/javascript" id="hexo.configurations">
  // Reuse a global NexT object if one already exists on window; otherwise start from an empty object.
  var NexT = window.NexT || {};
  // Page-wide settings object. NOTE(review): presumably read by the theme scripts loaded later in the page - confirm.
  var CONFIG = {
    root: '/',
    scheme: 'Gemini',
    version: '6.4.1',
    sidebar: {"position":"left","display":"post","offset":12,"b2t":false,"scrollpercent":false,"onmobile":false},
    fancybox: false,
    fastclick: false,
    lazyload: false,
    tabs: true,
    motion: {"enable":true,"async":false,"transition":{"post_block":"fadeIn","post_header":"slideDownIn","post_body":"slideDownIn","coll_header":"slideLeftIn","sidebar":"slideUpIn"}},
    // Algolia search settings; applicationID, apiKey and indexName are empty strings on this page.
    algolia: {
      applicationID: '',
      apiKey: '',
      indexName: '',
      hits: {"per_page":10},
      // The ${...} placeholders are plain text inside ordinary string literals here, not template literals.
      labels: {"input_placeholder":"Search for Posts","hits_empty":"We didn't find any results for the search: ${query}","hits_stats":"${hits} results found in ${time} ms"}
    }
  };
</script>


  




  <meta name="description" content="摘要：大数据技术与我们日常生活越来越紧密，要做大数据，首要解决数据问题。原始数据存在大量不完整、不一致、有异常的数据，严重影响到数据建模的执行效率，甚至可能导致模型结果的偏差，因此要数据预处理。数据预处理主要是将原始数据经过文本抽取、数据清理、数据集成、数据处理、数据变换、数据降维等处理后，不仅提高了数据质量，而且更好地提升算法模型性能。数据预处理在数据挖掘、自然语言处理、机器学习、深度学习算法中">
<meta name="keywords" content="Python,数据预处理,pywin32">
<meta property="og:type" content="article">
<meta property="og:title" content="Python数据预处理之抽取文本信息（2）">
<meta property="og:url" content="https://bainingchao.github.io/2018/12/21/数据预处理之抽取文本信息（2）/index.html">
<meta property="og:site_name" content="白宁超的官网">
<meta property="og:description" content="摘要：大数据技术与我们日常生活越来越紧密，要做大数据，首要解决数据问题。原始数据存在大量不完整、不一致、有异常的数据，严重影响到数据建模的执行效率，甚至可能导致模型结果的偏差，因此要数据预处理。数据预处理主要是将原始数据经过文本抽取、数据清理、数据集成、数据处理、数据变换、数据降维等处理后，不仅提高了数据质量，而且更好地提升算法模型性能。数据预处理在数据挖掘、自然语言处理、机器学习、深度学习算法中">
<meta property="og:locale" content="zh-CN">
<meta property="og:image" content="https://i.imgur.com/G5dR5VM.png">
<meta property="og:image" content="https://i.imgur.com/ifprovl.png">
<meta property="og:image" content="https://i.imgur.com/SKmSm17.png">
<meta property="og:image" content="https://i.imgur.com/xzvVZra.png">
<meta property="og:image" content="https://i.imgur.com/mvEnEch.png">
<meta property="og:image" content="http://pub.idqqimg.com/wpa/images/group.png">
<meta property="og:image" content="https://i.imgur.com/NEXhm2W.png">
<meta property="og:updated_time" content="2019-03-06T09:06:38.236Z">
<meta name="twitter:card" content="summary">
<meta name="twitter:title" content="Python数据预处理之抽取文本信息（2）">
<meta name="twitter:description" content="摘要：大数据技术与我们日常生活越来越紧密，要做大数据，首要解决数据问题。原始数据存在大量不完整、不一致、有异常的数据，严重影响到数据建模的执行效率，甚至可能导致模型结果的偏差，因此要数据预处理。数据预处理主要是将原始数据经过文本抽取、数据清理、数据集成、数据处理、数据变换、数据降维等处理后，不仅提高了数据质量，而且更好地提升算法模型性能。数据预处理在数据挖掘、自然语言处理、机器学习、深度学习算法中">
<meta name="twitter:image" content="https://i.imgur.com/G5dR5VM.png">



  <link rel="alternate" href="/atom.xml" title="白宁超的官网" type="application/atom+xml" />




  <link rel="canonical" href="https://bainingchao.github.io/2018/12/21/数据预处理之抽取文本信息（2）/"/>



<script type="text/javascript" id="page.configurations">
  // Page-level settings attached to the CONFIG object declared in the earlier inline script;
  // `sidebar` is an empty string for this page.
  CONFIG.page = {
    sidebar: "",
  };
</script>

  <title>Python数据预处理之抽取文本信息（2） | 白宁超的官网</title>
  









  <noscript>
  <style type="text/css">
    /* No-JavaScript fallback (this stylesheet sits inside <noscript>):
       reset opacity and offsets of the motion-related elements to their initial values. */
    .use-motion .motion-element,
    .use-motion .brand,
    .use-motion .menu-item,
    .sidebar-inner,
    .use-motion .post-block,
    .use-motion .pagination,
    .use-motion .comments,
    .use-motion .post-header,
    .use-motion .post-body,
    .use-motion .collection-title { opacity: initial; }

    .use-motion .logo,
    .use-motion .site-title,
    .use-motion .site-subtitle {
      opacity: initial;
      top: initial;
    }

    /* Written as flat descendant selectors: a nested rule block is discarded by
       CSS parsers without nesting support, which would lose this fallback. */
    .use-motion .logo-line-before i { left: initial; }
    .use-motion .logo-line-after i { right: initial; }
  </style>
</noscript><!-- hexo-inject:begin --><!-- hexo-inject:end -->

</head>

<body itemscope itemtype="http://schema.org/WebPage" lang="zh-CN">

  
  
    
  

  <!-- hexo-inject:begin --><!-- hexo-inject:end --><div class="container sidebar-position-left page-post-detail">
    <div class="headband"></div>

	<!-- <a href="https://github.com/bainingchao"><img style="position: absolute; top: 0; right: 0; border: 0;" src="https://s3.amazonaws.com/github/ribbons/forkme_right_red_aa0000.png" alt="Fork me on GitHub"></a> !-->
	
    <header id="header" class="header" itemscope itemtype="http://schema.org/WPHeader">
      <div class="header-inner"><div class="site-brand-wrapper">
  <div class="site-meta ">
    

    <div class="custom-logo-site-title">
      <a href="/" class="brand" rel="start">
        <span class="logo-line-before"><i></i></span>
        <span class="site-title">白宁超的官网</span>
        <span class="logo-line-after"><i></i></span>
      </a>
    </div>
    
      
        <h1 class="site-subtitle" itemprop="description">专注人工智能领域研究</h1>
      
    
  </div>

  <div class="site-nav-toggle">
    <button aria-label="切换导航栏">
      <span class="btn-bar"></span>
      <span class="btn-bar"></span>
      <span class="btn-bar"></span>
    </button>
  </div>
</div>



<nav class="site-nav">
  
    <ul id="menu" class="menu">
      
        
        
        
          
          <li class="menu-item menu-item-首页">
    <a href="/" rel="section">
      <i class="menu-item-icon fa fa-fw fa-home"></i> <br />首页</a>
  </li>
        
        
        
          
          <li class="menu-item menu-item-标签">
    <a href="/tags/" rel="section">
      <i class="menu-item-icon fa fa-fw fa-tags"></i> <br />标签</a>
  </li>
        
        
        
          
          <li class="menu-item menu-item-分类">
    <a href="/categories/" rel="section">
      <i class="menu-item-icon fa fa-fw fa-th"></i> <br />分类</a>
  </li>
        
        
        
          
          <li class="menu-item menu-item-归档">
    <a href="/archives/" rel="section">
      <i class="menu-item-icon fa fa-fw fa-archive"></i> <br />归档</a>
  </li>
        
        
        
          
          <li class="menu-item menu-item-视频">
    <a href="/videos/" rel="section">
      <i class="menu-item-icon fa fa-fw fa-sitemap"></i> <br />视频</a>
  </li>
        
        
        
          
          <li class="menu-item menu-item-书籍">
    <a href="/books/" rel="section">
      <i class="menu-item-icon fa fa-fw fa-th"></i> <br />书籍</a>
  </li>
        
        
        
          
          <li class="menu-item menu-item-链接">
    <a href="/links/" rel="section">
      <i class="menu-item-icon fa fa-fw fa-question-circle"></i> <br />链接</a>
  </li>
        
        
        
          
          <li class="menu-item menu-item-关于">
    <a href="/about/" rel="section">
      <i class="menu-item-icon fa fa-fw fa-user"></i> <br />关于</a>
  </li>

      
      
        <li class="menu-item menu-item-search">
          
            <a href="javascript:;" class="popup-trigger">
          
            
              <i class="menu-item-icon fa fa-search fa-fw"></i> <br />搜索</a>
        </li>
      
    </ul>
  

  

  
    <div class="site-search">
      
  <div class="popup search-popup local-search-popup">
  <div class="local-search-header clearfix">
    <span class="search-icon">
      <i class="fa fa-search"></i>
    </span>
    <span class="popup-btn-close">
      <i class="fa fa-times-circle"></i>
    </span>
    <div class="local-search-input-wrapper">
      <input autocomplete="off"
             placeholder="搜索..." spellcheck="false"
             type="text" id="local-search-input">
    </div>
  </div>
  <div id="local-search-result"></div>
</div>



    </div>
  
</nav>



  



</div>
    </header>

    


    <main id="main" class="main">
      <div class="main-inner">
        <div class="content-wrap">
          
            

          
          <div id="content" class="content">
            

  <div id="posts" class="posts-expand">
    

  

  
  
  

  

  <article class="post post-type-normal" itemscope itemtype="http://schema.org/Article">
  
  
  
  <div class="post-block">
    <link itemprop="mainEntityOfPage" href="https://bainingchao.github.io/2018/12/21/数据预处理之抽取文本信息（2）/">

    <!-- Hidden schema.org Person microdata for the post author. The description is plain text:
         raw HTML with double quotes inside a content attribute ends the attribute early. -->
    <span hidden itemprop="author" itemscope itemtype="http://schema.org/Person">
      <meta itemprop="name" content="白宁超">
      <meta itemprop="description" content="本站主要研究深度学习、机器学习、自然语言处理等前沿技术。ML&amp;NLP交流群：436303759">
      <meta itemprop="image" content="/../images/header.png">
    </span>

    <span hidden itemprop="publisher" itemscope itemtype="http://schema.org/Organization">
      <meta itemprop="name" content="白宁超的官网">
    </span>

    
      <header class="post-header">

        
        
          <h2 class="post-title" itemprop="name headline">Python数据预处理之抽取文本信息（2）
              
            
          </h2>
        

        <div class="post-meta">
          <span class="post-time">

            
            
            

            
              <span class="post-meta-item-icon">
                <i class="fa fa-calendar-o"></i>
              </span>
              
                <span class="post-meta-item-text">发表于</span>
              

              
                
              

              <time title="创建时间：2018-12-21 14:06:12" itemprop="dateCreated datePublished" datetime="2018-12-21T14:06:12+08:00">2018-12-21</time>
            

            
              

              
                
                <span class="post-meta-divider">|</span>
                

                <span class="post-meta-item-icon">
                  <i class="fa fa-calendar-check-o"></i>
                </span>
                
                  <span class="post-meta-item-text">更新于</span>
                
                <time title="修改时间：2019-03-06 17:06:38" itemprop="dateModified" datetime="2019-03-06T17:06:38+08:00">2019-03-06</time>
              
            
          </span>

          
            <span class="post-category" >
            
              <span class="post-meta-divider">|</span>
            
              <span class="post-meta-item-icon">
                <i class="fa fa-folder-o"></i>
              </span>
              
                <span class="post-meta-item-text">分类于</span>
              
              
                <span itemprop="about" itemscope itemtype="http://schema.org/Thing"><a href="/categories/数据预处理/" itemprop="url" rel="index"><span itemprop="name">数据预处理</span></a></span>

                
                
              
            </span>
          

          
            
          

          
          

          
            <span class="post-meta-divider">|</span>
            <span class="post-meta-item-icon"
            >
            <i class="fa fa-eye"></i>
             阅读次数： 
            <span class="busuanzi-value" id="busuanzi_value_page_pv" ></span>
            </span>
          
		  

          

          

        </div>
      </header>
    

    
    
    
    <div class="post-body" itemprop="articleBody">

      
      

      
        <blockquote>
<p>摘要：大数据技术与我们日常生活越来越紧密，要做大数据，首要解决数据问题。原始数据存在大量不完整、不一致、有异常的数据，严重影响到数据建模的执行效率，甚至可能导致模型结果的偏差，因此要数据预处理。数据预处理主要是将原始数据经过文本抽取、数据清理、数据集成、数据处理、数据变换、数据降维等处理后，不仅提高了数据质量，而且更好地提升算法模型性能。数据预处理在数据挖掘、自然语言处理、机器学习、深度学习算法中起着重要的作用。（本文原创，转载必须注明出处。）</p>
</blockquote>
<a id="more"></a>
<h2 id="数据类型与数据采集"><a href="#数据类型与数据采集" class="headerlink" title="数据类型与数据采集"></a>数据类型与数据采集</h2><p>通常说的数据指的是数字、图表这类信息。在大数据领域，所谓的数据总体上包括结构化数据、半结构化数据和非结构化数据。</p>
<blockquote>
<p>结构化数据</p>
</blockquote>
<p>结构化的数据是指可以使用关系型数据库表示和存储，表现为二维形式的数据。一般特点是：数据以行为单位，一行数据表示一个实体的信息，每一行数据的属性是相同的。比如：</p>
<div class="table-container">
<table>
<thead>
<tr>
<th style="text-align:center">id</th>
<th style="text-align:center">name</th>
<th style="text-align:center">age</th>
<th style="text-align:center">gender</th>
</tr>
</thead>
<tbody>
<tr>
<td style="text-align:center">1</td>
<td style="text-align:center">张三</td>
<td style="text-align:center">12</td>
<td style="text-align:center">男</td>
</tr>
<tr>
<td style="text-align:center">2</td>
<td style="text-align:center">李花</td>
<td style="text-align:center">13</td>
<td style="text-align:center">女</td>
</tr>
<tr>
<td style="text-align:center">3</td>
<td style="text-align:center">王五</td>
<td style="text-align:center">18</td>
<td style="text-align:center">男</td>
</tr>
</tbody>
</table>
</div>
<ul>
<li>数据特点：关系模型数据，关系数据库表示。</li>
<li>常见格式：比如MySQL、Oracle、SQL Server等。</li>
<li>应用场合：数据库、系统网站、数据备份、ERP等。</li>
<li>数据采集：DB导出、SQL等方式。</li>
</ul>
<p>结构化的数据的存储和排列是很有规律的，这对查询和修改等操作很有帮助。但是，它的扩展性不好。</p>
<blockquote>
<p>半结构化数据</p>
</blockquote>
<p>半结构化数据是结构化数据的一种形式，它并不符合关系型数据库或其他数据表的形式关联起来的数据模型结构，但包含相关标记，用来分隔语义元素以及对记录和字段进行分层。因此，它也被称为自描述的结构。半结构化数据，属于同一类实体可以有不同的属性，即使他们被组合在一起，这些属性的顺序并不重要。常见的半结构数据有XML如下：</p>
<pre>
&lt;person&gt;
    &lt;name&gt;李花&lt;/name&gt;
    &lt;age&gt;13&lt;/age&gt;
    &lt;gender&gt;女&lt;/gender&gt;
&lt;/person&gt;
</pre>

<ul>
<li>数据特点：非关系模型数据，还有一定的格式。</li>
<li>常见格式：比如Email、HTML、XML、JSON等。</li>
<li>应用场合：邮件系统、档案系统、新闻网站等。</li>
<li>数据采集：网络爬虫、数据解析等方式。</li>
</ul>
<p>不同的半结构化数据的属性的个数是不定的。有些人说半结构化数据是以树或者图的数据结构存储的数据，上面的例子中，&lt;person&gt;标签是树的根节点，&lt;name&gt;和&lt;gender&gt;标签是子节点。通过这样的数据格式，可以自由地表达很多有用的信息，包括自我描述信息（元数据）。所以，半结构化数据的扩展性是很好的。</p>
<blockquote>
<p>非结构化数据</p>
</blockquote>
<p>就是没有固定结构的数据。各种文档、图片、视频/音频等都属于非结构化数据。对于这类数据，我们一般直接整体进行存储，而且一般存储为二进制的数据格式。如下所示：<br><img src="https://i.imgur.com/G5dR5VM.png" alt=""></p>
<ul>
<li>数据特点：没有固定格式的数据</li>
<li>常见格式：Word、PDF、PPT、图片、音视频等。</li>
<li>应用场合：图片识别、人脸识别、医疗影像、文本分析等。</li>
<li>数据采集：网络爬虫、数据存档等方式。</li>
</ul>
<h2 id="常见的文本抽取方法"><a href="#常见的文本抽取方法" class="headerlink" title="常见的文本抽取方法"></a>常见的文本抽取方法</h2><p>针对数据的不同形式，通过特定的数据采集方式（文档下载、数据库导出、网络爬虫、语音收集、图片解析等等）获取数据，无论是结构化的数据库文件、半结构化的网页数据，还是非结构化的图片、音视频，我们最终的目的都是将数据传入到电脑之中，通过算法模型挖掘其潜在的价值，为最终的AI技术做支撑。不同的是，在结构化和半结构化数据的数据集成过程中，我们可以提取相关文本信息，做进一步的数据预处理；而非结构化的图片、音视频我们采用一定的技术手段，获取其对应的数据点矩阵。这一点不太容易理解，比如说我们想解析一张图片的数据，我们知道图片是由长宽高组成的，还包括红蓝绿三种基本色。那么我们就找到对应的多维特征，采用数据点占位表示，比如：</p>
<pre><code>图片名    长(bit)        宽(bit)        红        绿        蓝
猫1        12            100            0        0        1
狗2        101            234            1        1        1
猪3        202            24            0        1        0
</code></pre><p>上面的数据表示：猫1这张图片，长宽位点（12,100）处只有蓝色构成；狗2这张图片，长宽位点（101,234）处由红绿蓝3种颜色构成；猪3这张图片，长宽位点（202,24）处只有绿色构成。这就是非结构数据图片转化为数值型数据的原理。完整的数据挖掘流程图如下所示：</p>
<p><img src="https://i.imgur.com/ifprovl.png" alt=""></p>
<p>我们根据不同的数据类型，采用对应的数据采集方式获取目标数据。这时候的数据质量很差，存在文本格式不同、数据表示形式不同等诸多问题。这里我们单纯地考虑文本信息的处理，就文本信息而言，你采集的数据可能是网页、数据库文件、pdf文档、word文档等等。我们想去处理这些数据，还需要对数据进行集成，即转化为统一的数据格式，这里我们就需要文本信息抽取，常见的抽取方式包括以下几种：在线格式转换工具、office内置格式转换、自己开发文本抽取工具。详见下图：</p>
<p><img src="https://i.imgur.com/SKmSm17.png" alt=""></p>
<p>经过实际操作会发现采用在线格式转换工具存在几个弊端，要么限制文件转化的数量，要么就是收费的；而采用本地office自带的另存为功能，把文档一个个另存为文本，肯定不现实。基于上述情况，我们对工具抽取的弊端总结如下：</p>
<ul>
<li>格式转换后，识别乱码较多</li>
<li>不支持或者限制支持批量处理</li>
<li>批量转化收费问题</li>
<li>格式转换后的txt文件存在编码问题</li>
<li>生成文件名一堆数字乱码</li>
<li>操作不够灵活便捷</li>
</ul>
<p>针对以上问题，我们就去寻求解决方式，那就是自己动手丰衣足食，自己打造批量文本抽取工具，我们期待的效果是：</p>
<ul>
<li>支持PDF/Word等多格式文本抽取</li>
<li>自动过滤不符合指定格式的文件</li>
<li>生成的目标文件与原文件目录一致</li>
<li>生成文档采用统一的编码格式保存（如：UTF-8 ）</li>
<li>支持默认保存路径和自定义保存路径</li>
</ul>
<h2 id="抽取Word文档文本"><a href="#抽取Word文档文本" class="headerlink" title="抽取Word文档文本"></a>抽取Word文档文本</h2><p>做word文档抽取工作，我们运行环境是在win10-64bit下，python3.5，Anaconda4.4版本下执行的，所使用的插件是win32com。下载地址：<a href="https://pan.baidu.com/s/1-2BsiTs8XjMIe5Gnh_GFjw" target="_blank" rel="noopener">https://pan.baidu.com/s/1-2BsiTs8XjMIe5Gnh_GFjw</a> 密码: 7j3t<br>预装完win32com以后，以下代码便完成抽取word文本信息。</p>
<blockquote>
<p>算法思路：</p>
</blockquote>
<ul>
<li>定义文件路径和转存路径：split</li>
<li>修改新的文件名：fnmatch</li>
<li>设置完整的保存路径：join</li>
<li>启动应用程序格式转换：Dispatch</li>
<li>保存文本：SaveAs</li>
</ul>
<blockquote>
<p>算法流程：</p>
</blockquote>
<p><img src="https://i.imgur.com/xzvVZra.png" alt=""></p>
<blockquote>
<p>代码实现：</p>
</blockquote>
<pre>
# coding=utf-8

"""
Description: Word文件转化TXT文本
Author：伏草惟存
Prompt: code in Python3 env
Install package： pip install pypiwin32
"""

import os,fnmatch
from win32com import client as wc
from win32com.client import Dispatch


'''
功能描述：word文件转存txt文件，默认存储当前路径下；用户可以指定存储文件路径。
参数描述：1 filePath：文件路径   2 savePath： 指定保存路径
'''
def Word2Txt(filePath,savePath=''):
    # 1 切分文件上级目录和文件名
    dirs,filename = os.path.split(filePath)
    # print(dirs,'\n',filename)

    # 2 修改转化后的文件名
    new_name = ''
    if fnmatch.fnmatch(filename,'*.doc'):
        new_name = filename[:-4]+'.txt'
    elif fnmatch.fnmatch(filename,'*.docx'):
        new_name = filename[:-5]+'.txt'
    else: return
    print('-&gt;',new_name)

    # 3 文件转化后的保存路径
    if savePath=='': savePath = dirs
    else: savePath = savePath
    word_to_txt = os.path.join(savePath,new_name)
    print('-&gt;',word_to_txt)

    # 4 加载处理应用,word转化txt
    wordapp = wc.Dispatch('Word.Application')
    mytxt = wordapp.Documents.Open(filePath)
    mytxt.SaveAs(word_to_txt,4)
    mytxt.Close()



if __name__=='__main__':
    filepath = os.path.abspath(r'../dataSet/filename.doc')
    # savepath = ''
    Word2Txt(filepath)
</pre>

<h2 id="抽取PDF文档文本"><a href="#抽取PDF文档文本" class="headerlink" title="抽取PDF文档文本"></a>抽取PDF文档文本</h2><blockquote>
<p>算法思路：</p>
</blockquote>
<ul>
<li>定义文件路径和转存路径：split</li>
<li>修改新的文件名：fnmatch</li>
<li>设置完整的保存路径：join</li>
<li>启动应用程序格式转换：Dispatch</li>
<li>保存文本：SaveAs</li>
</ul>
<blockquote>
<p>算法流程：</p>
</blockquote>
<p><img src="https://i.imgur.com/mvEnEch.png" alt=""></p>
<blockquote>
<p>代码实现：</p>
</blockquote>
<pre>
# coding=utf-8

"""
Description: PDF文件转化TXT文本
Author：伏草惟存
Prompt: code in Python3 env
"""

import os,fnmatch
from win32com import client as wc
from win32com.client import Dispatch,gencache


'''
功能描述：pdf文件转化txt文本
参数描述：1 filePath：文件路径  2 savePath： 指定保存路径
'''
def Pdf2Txt(filePath,savePath=''):
    # 1 切分文件上级目录和文件名
    dirs,filename = os.path.split(filePath)
    # print('目录：',dirs,'\n文件名：',filename)

    # 2 修改转化后的文件名
    new_name = ""
    if fnmatch.fnmatch(filename,'*.pdf') or fnmatch.fnmatch(filename,'*.PDF'):
        new_name = filename[:-4]+'.txt' # 截取".pdf"之前的文件名
    else: return
    print('新的文件名：',new_name)

    # 3 文件转化后的保存路径
    if savePath=="": savePath = dirs
    else: savePath = savePath
    pdf_to_txt = os.path.join(savePath,new_name)
    print('保存路径：',pdf_to_txt)

    # 4 加载处理应用,pdf转化txt
    wordapp = wc.Dispatch('Word.Application')
    mytxt = wordapp.Documents.Open(filePath)
    mytxt.SaveAs(pdf_to_txt,4)
    mytxt.Close()



if __name__=='__main__':
    # 使用绝对路径
    filePath = os.path.abspath(r'../dataSet/Corpus/pdftotxt/2018年世界新闻自由日.pdf')
    # savePath = r'E:\\'
    Pdf2Txt(filePath)
</pre>

<h2 id="文本抽取工具与编码"><a href="#文本抽取工具与编码" class="headerlink" title="文本抽取工具与编码"></a>文本抽取工具与编码</h2><blockquote>
<p>算法思路：</p>
</blockquote>
<ul>
<li>定义文件夹路径和转存夹路径：split</li>
<li>修改新的文件名：TranType(filename， typename)、fnmatch</li>
<li>设置完整的保存路径：join</li>
<li>启动应用程序格式转换：Dispatch</li>
<li>保存文本：SaveAs</li>
</ul>
<blockquote>
<p>代码实现</p>
</blockquote>
<pre>
# coding=utf-8

"""
Description: 多文档格式转换工具
Author：伏草惟存
Prompt: code in Python3 env
"""

import os,fnmatch
from win32com import client as wc
from win32com.client import Dispatch,gencache


'''
功能描述：抽取文件文本信息
参数描述：1 filePath：文件路径  2 savePath： 指定保存路径
'''
def Files2Txt(filePath,savePath=''):
    try:
        # 1 切分文件上级目录和文件名
        dirs,filename = os.path.split(filePath)
        # print('目录：',dirs,'\n文件名：',filename)

        # 2 修改转化后的文件名
        typename = os.path.splitext(filename)[-1].lower() # 获取后缀
        new_name = TranType(filename,typename)
        # print('新的文件名：',new_name)

        # 3 文件转化后的保存路径
        if savePath=="": savePath = dirs
        else: savePath = savePath
        new_save_path = os.path.join(savePath,new_name)
        print('保存路径：',new_save_path)

        # 4 加载处理应用
        wordapp = wc.Dispatch('Word.Application')
        mytxt = wordapp.Documents.Open(filePath)
        mytxt.SaveAs(new_save_path,4)
        mytxt.Close()
    except Exception as e:
        pass


'''
功能描述：根据文件后缀修改文件名
参数描述：1 filePath：文件路径  2 typename 文件后缀
返回数据：new_name 返回修改后的文件名
'''
def TranType(filename,typename):
    # 新的文件名称
    new_name = ""
    if typename == '.pdf' : # pdf-&gt;txt
        if fnmatch.fnmatch(filename,'*.pdf') :
            new_name = filename[:-4]+'.txt' # 截取".pdf"之前的文件名
        else: return
    elif typename == '.doc' or typename == '.docx' :  # word-&gt;txt
        if fnmatch.fnmatch(filename, '*.doc') :
            new_name = filename[:-4]+'.txt'
        elif fnmatch.fnmatch(filename, '*.docx'):
            new_name = filename[:-5]+'.txt'
        else: return
    else:
        print('警告：\n您输入[',typename,']不合法，本工具支持pdf/doc/docx格式,请输入正确格式。')
        return
    return new_name



if __name__ == '__main__':
    filePath1 = os.path.abspath(r'../dataSet/Corpus/wordtotxt/一种改进的朴素贝叶斯文本分类方法研究.doc')
    filePath2 = os.path.abspath(r'../dataSet/Corpus/pdftotxt/改进朴素贝叶斯文本分类方法研究.pdf')
    filePath3 = os.path.abspath(r'../dataSet/Corpus/wordtotxt/科技项目数据挖掘决策架构.docx')
    Files2Txt(filePath3)
</pre>

<blockquote>
<p>遍历读取文件</p>
</blockquote>
<ul>
<li>遍历文件的类TraversalFun ： TraversalDir、 AllFiles</li>
<li>遍历目录文件TraversalDir ： AllFiles(self.rootDir)</li>
<li>递归遍历文件AllFiles： AllFiles(self,rootDir)</li>
<li>判断是否为文件isfile ：打印出文件名</li>
<li>判断是否是目录isdir ：递归遍历</li>
</ul>
<blockquote>
<p>遍历文件源码实现</p>
</blockquote>
<pre>
# coding=utf-8

"""
Description: 遍历读取文件名
Author：伏草惟存
Prompt: code in Python3 env
"""

import os,time


'''
功能描述：遍历目录处理子文件
参数描述： 1 rootDir 目标文件的根目录
'''
class TraversalFun():
    # 1 初始化
    def __init__(self,rootDir):
        self.rootDir = rootDir # 目录路径

    # 2 遍历目录文件
    def TraversalDir(self):
        TraversalFun.AllFiles(self,self.rootDir)

    # 3 递归遍历所有文件，并提供具体文件操作功能
    def AllFiles(self,rootDir):
        # 返回指定目录包含的文件或文件夹的名字的列表
        for lists in os.listdir(rootDir):
            # 待处理文件夹名字集合
            path = os.path.join(rootDir, lists)
            # 核心算法，对文件具体操作
            if os.path.isfile(path):
                print(os.path.abspath(path))
            # 递归遍历文件目录
            elif os.path.isdir(path):
                TraversalFun.AllFiles(self,path)



if __name__ == '__main__':
    time_start=time.time()

    # 根目录文件路径
    rootDir = r"../dataSet/Corpus/EnPapers"
    tra=TraversalFun(rootDir) # 默认方法参数打印所有文件路径
    tra.TraversalDir()     # 遍历文件并进行相关操作

    time_end=time.time()
    print('totally cost',time_end-time_start,'s')
</pre>

<h2 id="实战案例：遍历文件批量抽取新闻文本内容"><a href="#实战案例：遍历文件批量抽取新闻文本内容" class="headerlink" title="实战案例：遍历文件批量抽取新闻文本内容"></a>实战案例：遍历文件批量抽取新闻文本内容</h2><blockquote>
<p>算法思路</p>
</blockquote>
<ul>
<li>引用外部文本抽取模块：import ExtractTxt as ET</li>
<li>参数方法使用：TraversalFun(rootDir,ET.Files2Txt,saveDir)</li>
<li>创建保存根目录：os.path.abspath</li>
<li>递归遍历文件：func(path, save_dir)</li>
</ul>
<blockquote>
<p>源码实现</p>
</blockquote>
<pre>
# coding=utf-8

"""
Description: 批量文档格式自动转化txt
Author：伏草惟存
Prompt: code in Python3 env
"""

import ExtractTxt as ET
import os,time


'''
功能描述：遍历目录，对子文件单独处理
参数描述：1 rootDir 根目录  2 deffun：方法参数  3 saveDir: 保存路径
'''
class TraversalFun():
    # 1 初始化
    def __init__(self,rootDir,func=None,saveDir=""):
        self.rootDir = rootDir # 目录路径
        self.func = func   # 参数方法
        self.saveDir = saveDir # 保存路径

    # 2 遍历目录文件
    def TraversalDir(self):
        # 切分文件上级目录和文件名
        dirs,latername = os.path.split(self.rootDir)
        # print(rootDir,'\n',dirs,'\n',latername)

        # 保存目录
        save_dir = ""
        if self.saveDir=="": # 默认文件保存路径
            save_dir = os.path.abspath(os.path.join(dirs,'new_'+latername))
        else: save_dir = self.saveDir

        # 创建目录文件
        if not os.path.exists(save_dir): os.makedirs(save_dir)
        print("保存目录：\n"+save_dir)

        # 遍历文件并将其转化txt文件
        TraversalFun.AllFiles(self,self.rootDir,save_dir)


    # 3 递归遍历所有文件，并提供具体文件操作功能
    def AllFiles(self,rootDir,save_dir=''):
        # 返回指定目录包含的文件或文件夹的名字的列表
        for lists in os.listdir(rootDir):
            # 待处理文件夹名字集合
            path = os.path.join(rootDir, lists)

            # 核心算法，对文件具体操作
            if os.path.isfile(path):
                self.func(os.path.abspath(path),os.path.abspath(save_dir))

            # 递归遍历文件目录
            if os.path.isdir(path):
                newpath = os.path.join(save_dir, lists)
                if not os.path.exists(newpath):
                    os.mkdir(newpath)
                TraversalFun.AllFiles(self,path,newpath)




if __name__ == '__main__':
    time_start=time.time()

    # 根目录文件路径
    rootDir = r"../dataSet/Corpus/EnPapers"
    # saveDir = r"./Corpus/TxtEnPapers"
    tra=TraversalFun(rootDir,ET.Files2Txt) # 默认方法参数打印所有文件路径
    tra.TraversalDir()                   # 遍历文件并进行相关操作

    time_end=time.time()
    print('totally cost',time_end-time_start,'s')
</pre>

<h2 id="源码获取"><a href="#源码获取" class="headerlink" title="源码获取"></a>源码获取</h2><blockquote>
<p>源码请进【机器学习和自然语言QQ群：436303759】文件下载：<a target="_blank" href="http://shang.qq.com/wpa/qunwpa?idkey=ef3bbb679b06ac59b136c57ba9e7935ff9d3b10faeabde6e4efcafe523bbbf4d"><img border="0" src="http://pub.idqqimg.com/wpa/images/group.png" alt="自然语言处理和机器学习技术QQ交流" title="自然语言处理和机器学习技术交流"></a></p>
</blockquote>
<p><img src="https://i.imgur.com/NEXhm2W.png" alt=""></p>
<h2 id="作者声明"><a href="#作者声明" class="headerlink" title="作者声明"></a>作者声明</h2><blockquote>
<p>本文版权归作者所有，旨在技术交流使用。未经作者同意禁止转载，转载后需在文章页面明显位置给出原文连接，否则相关责任自行承担。</p>
</blockquote>

      
    </div>

    

    
    
    

    
      <div>
        <div id="wechat_subscriber" style="display: block; padding: 10px 0; margin: 20px auto; width: 100%; text-align: center">
    <img id="wechat_subscriber_qcode" src="/uploads/wechat.png" alt="白宁超 wechat" style="width: 200px; max-width: 100%;"/>
    <div>扫一扫关注微信公众号，机器学习和自然语言处理，订阅号datathinks！</div>
</div>

      </div>
    

    
      <div>
        <div style="padding: 10px 0; margin: 20px auto; width: 90%; text-align: center;">
  <div></div>
  <!-- Toggles the #QR payment panel between hidden and shown. type="button" makes the in-page action explicit. -->
  <button id="rewardButton" type="button" onclick="var qr = document.getElementById('QR'); if (qr.style.display === 'none') {qr.style.display='block';} else {qr.style.display='none'}">
    <span>打赏</span>
  </button>
  <div id="QR" style="display: none;">

    
      <div id="wechat" style="display: inline-block">
        <img id="wechat_qr" src="/images/wechatpay.jpg" alt="白宁超 微信支付"/>
        <p>微信支付</p>
      </div>
    

    
      <div id="alipay" style="display: inline-block">
        <img id="alipay_qr" src="/images/alipay.jpg" alt="白宁超 支付宝"/>
        <p>支付宝</p>
      </div>
    

    

  </div>
</div>

      </div>
    

    

    <footer class="post-footer">
      
        <div class="post-tags">
          
            <a href="/tags/Python/" rel="tag"><i class="fa fa-tag"></i> Python</a>
          
            <a href="/tags/数据预处理/" rel="tag"><i class="fa fa-tag"></i> 数据预处理</a>
          
            <a href="/tags/pywin32/" rel="tag"><i class="fa fa-tag"></i> pywin32</a>
          
        </div>
      

      
      
        <div class="post-widgets">
        

        

        
          
          <div class="social_share">
            
               <div>
                 
  <div class="bdsharebuttonbox">
    <a href="#" class="bds_tsina" data-cmd="tsina" title="分享到新浪微博"></a>
    <a href="#" class="bds_douban" data-cmd="douban" title="分享到豆瓣网"></a>
    <a href="#" class="bds_sqq" data-cmd="sqq" title="分享到QQ好友"></a>
    <a href="#" class="bds_qzone" data-cmd="qzone" title="分享到QQ空间"></a>
    <a href="#" class="bds_weixin" data-cmd="weixin" title="分享到微信"></a>
    <a href="#" class="bds_tieba" data-cmd="tieba" title="分享到百度贴吧"></a>
    <a href="#" class="bds_twi" data-cmd="twi" title="分享到Twitter"></a>
    <a href="#" class="bds_fbook" data-cmd="fbook" title="分享到Facebook"></a>
    <a href="#" class="bds_more" data-cmd="more"></a>
    <a class="bds_count" data-cmd="count"></a>
  </div>
  <script>
    // Share-widget settings published on window._bd_share_config.
    // NOTE(review): presumably consumed by the share.js script injected by the following <script> block - confirm.
    window._bd_share_config = {
      "common": {
        "bdText": "",
        "bdMini": "2",
        "bdMiniList": false,
        "bdPic": ""
      },
      "share": {
        "bdSize": "16",
        "bdStyle": "0"
      },
      "image": {
        "viewList": ["tsina", "douban", "sqq", "qzone", "weixin", "twi", "fbook"],
        "viewText": "分享到：",
        "viewSize": "16"
      }
    }
  </script>

<script>
  // Inject /static/api/js/share.js into <head> (falling back to <body>).
  // Written without the deprecated `with` statement; the IIFE keeps the locals off the global scope.
  (function () {
    var parent = document.getElementsByTagName('head')[0] || document.body;
    // Append first, then assign src: same order as the original one-liner.
    var script = parent.appendChild(document.createElement('script'));
    // cdnversion is derived from the current time divided by 36e5 ms (one hour), so it changes hourly.
    script.src = '/static/api/js/share.js?cdnversion=' + ~(-new Date() / 36e5);
  })();
</script>

               </div>
            
            
               <div id="needsharebutton-postbottom">
                 <span class="btn">
                    <i class="fa fa-share-alt" aria-hidden="true"></i>
                 </span>
               </div>
            
          </div>
        
        </div>
      
      

      
        <div class="post-nav">
          <div class="post-nav-next post-nav-item">
            
              <a href="/2018/11/29/天府大讲堂：5G时代的物联网发展趋势与产业变革/" rel="next" title="天府大讲堂：5G时代的物联网发展趋势与产业变革">
                <i class="fa fa-chevron-left"></i> 天府大讲堂：5G时代的物联网发展趋势与产业变革
              </a>
            
          </div>

          <span class="post-nav-divider"></span>

          <div class="post-nav-prev post-nav-item">
            
              <a href="/2018/12/24/Python数据预处理：机器学习、人工智能通用技术（1）/" rel="prev" title="Python数据预处理：机器学习、人工智能通用技术（1）">
                Python数据预处理：机器学习、人工智能通用技术（1） <i class="fa fa-chevron-right"></i>
              </a>
            
          </div>
        </div>
      

      
      
    </footer>
  </div>
  
  
  
  </article>


  </div>


          </div>
          

  
    <div class="comments" id="comments">
      <div id="lv-container" data-id="city" data-uid="MTAyMC8zOTc5NC8xNjMyMQ=="></div>
    </div>

  
 





        </div>
        
          
  
  <!-- Sidebar toggle control: a wrapper holding three line spans (first, middle, last). -->
  <div class="sidebar-toggle">
    <div class="sidebar-toggle-line-wrap">
      <span class="sidebar-toggle-line sidebar-toggle-line-first"></span>
      <span class="sidebar-toggle-line sidebar-toggle-line-middle"></span>
      <span class="sidebar-toggle-line sidebar-toggle-line-last"></span>
    </div>
  </div>

  <aside id="sidebar" class="sidebar">
    
    <div class="sidebar-inner">

      

      
        <!-- Sidebar tabs: each item names a panel class in data-target (post-toc-wrap = table of contents, site-overview-wrap = site overview); the table-of-contents tab carries the active class. -->
        <ul class="sidebar-nav motion-element">
          <li class="sidebar-nav-toc sidebar-nav-active" data-target="post-toc-wrap">
            文章目录
          </li>
          <li class="sidebar-nav-overview" data-target="site-overview-wrap">
            站点概览
          </li>
        </ul>
      

      <section class="site-overview-wrap sidebar-panel">
        <div class="site-overview">
          <!-- Author card: avatar, name, and the site description with a QQ group join badge.
               The avatar path is written without the redundant "/.." segment, the ampersand in the
               description is escaped, the badge image is fetched over https so it is not blocked as
               mixed content, and the new-tab link carries rel="noopener". -->
          <div class="site-author motion-element" itemprop="author" itemscope itemtype="http://schema.org/Person">
            <img class="site-author-image" itemprop="image"
              src="/images/header.png"
              alt="白宁超" />
            <p class="site-author-name" itemprop="name">白宁超</p>
            <p class="site-description motion-element" itemprop="description">本站主要研究深度学习、机器学习、自然语言处理等前沿技术。ML&amp;NLP交流群：436303759 <span><a target="_blank" rel="noopener" href="http://shang.qq.com/wpa/qunwpa?idkey=ef3bbb679b06ac59b136c57ba9e7935ff9d3b10faeabde6e4efcafe523bbbf4d"><img src="https://pub.idqqimg.com/wpa/images/group.png" alt="自然语言处理和机器学习技术QQ交流：436303759 " title="自然语言处理和机器学习技术交流"></a></span></p>
          </div>

          
            <!-- Site counters: posts, categories and tags, each linking to its index page. -->
            <nav class="site-state motion-element">
              <div class="site-state-item site-state-posts">
                <a href="/archives">
                  <span class="site-state-item-count">65</span>
                  <span class="site-state-item-name">日志</span>
                </a>
              </div>

              <div class="site-state-item site-state-categories">
                <a href="/categories/index.html">
                  <span class="site-state-item-count">29</span>
                  <span class="site-state-item-name">分类</span>
                </a>
              </div>

              <div class="site-state-item site-state-tags">
                <a href="/tags/index.html">
                  <span class="site-state-item-count">119</span>
                  <span class="site-state-item-name">标签</span>
                </a>
              </div>
            </nav>
          

          
            <!-- Link to the Atom feed; the RSS icon is decorative and hidden from assistive technology. -->
            <div class="feed-link motion-element">
              <a href="/atom.xml" rel="alternate">
                <i class="fa fa-rss" aria-hidden="true"></i>
                RSS
              </a>
            </div>
          

          
            <!-- Author's external links. Each opens in a new tab, so rel includes "noopener" to keep the
                 opened page from reaching back through window.opener; icons are decorative. -->
            <div class="links-of-author motion-element">
              <span class="links-of-author-item">
                <a href="https://github.com/bainingchao" target="_blank" title="GitHub" rel="external nofollow noopener"><i class="fa fa-fw fa-github" aria-hidden="true"></i>GitHub</a>
              </span>

              <span class="links-of-author-item">
                <a href="https://www.google.com.hk/" target="_blank" title="Google" rel="external nofollow noopener"><i class="fa fa-fw fa-google" aria-hidden="true"></i>Google</a>
              </span>

              <span class="links-of-author-item">
                <a href="https://www.baidu.com/" target="_blank" title="百度" rel="external nofollow noopener"><i class="fa fa-fw fa-globe" aria-hidden="true"></i>百度</a>
              </span>

              <span class="links-of-author-item">
                <a href="https://weibo.com/p/1005056002073632?is_all=1" target="_blank" title="微博" rel="external nofollow noopener"><i class="fa fa-fw fa-weibo" aria-hidden="true"></i>微博</a>
              </span>

              <span class="links-of-author-item">
                <a href="http://www.cnblogs.com/baiboy/" target="_blank" title="博客园" rel="external nofollow noopener"><i class="fa fa-fw fa-globe" aria-hidden="true"></i>博客园</a>
              </span>

              <span class="links-of-author-item">
                <a href="https://mp.weixin.qq.com/s/s97I4gtEJIt5rMivWMkPkQ" target="_blank" title="微信公众号" rel="external nofollow noopener"><i class="fa fa-fw fa-weixin" aria-hidden="true"></i>微信公众号</a>
              </span>
            </div>
          

          
          

          
          

          
            
          
          

        </div>
      </section>

      
      <!--noindex-->
        <!-- Table of contents. Every entry sits inside the single top-level <ol class="nav">;
             headings that follow a level-1 entry are nested in that entry's <ol class="nav-child">.
             The generated markup closed the top-level list after the third entry (followed by a
             stray </li>), leaving the level-1 items outside any list and the last one unclosed. -->
        <section class="post-toc-wrap motion-element sidebar-panel sidebar-panel-active">
          <div class="post-toc">
            <div class="post-toc-content">
              <ol class="nav">
                <li class="nav-item nav-level-2"><a class="nav-link" href="#数据类型与数据采集"><span class="nav-number">1.</span> <span class="nav-text">数据类型与数据采集</span></a></li>
                <li class="nav-item nav-level-2"><a class="nav-link" href="#常见的文本抽取方法"><span class="nav-number">2.</span> <span class="nav-text">常见的文本抽取方法</span></a></li>
                <li class="nav-item nav-level-2"><a class="nav-link" href="#抽取Word文档文本"><span class="nav-number">3.</span> <span class="nav-text">抽取Word文档文本</span></a></li>
                <li class="nav-item nav-level-1"><a class="nav-link" href="#coding-utf-8"><span class="nav-number"></span> <span class="nav-text">coding=utf-8</span></a>
                  <ol class="nav-child">
                    <li class="nav-item nav-level-2"><a class="nav-link" href="#抽取PDF文档文本"><span class="nav-number">1.</span> <span class="nav-text">抽取PDF文档文本</span></a></li>
                    <li class="nav-item nav-level-2"><a class="nav-link" href="#文本抽取工具与编码"><span class="nav-number">2.</span> <span class="nav-text">文本抽取工具与编码</span></a></li>
                  </ol>
                </li>
                <li class="nav-item nav-level-1"><a class="nav-link" href="#coding-utf-8-1"><span class="nav-number"></span> <span class="nav-text">coding=utf-8</span></a>
                  <ol class="nav-child">
                    <li class="nav-item nav-level-2"><a class="nav-link" href="#实战案例：遍历文件批量抽取新闻文本内容"><span class="nav-number">1.</span> <span class="nav-text">实战案例：遍历文件批量抽取新闻文本内容</span></a></li>
                    <li class="nav-item nav-level-2"><a class="nav-link" href="#源码获取"><span class="nav-number">2.</span> <span class="nav-text">源码获取</span></a></li>
                    <li class="nav-item nav-level-2"><a class="nav-link" href="#作者声明"><span class="nav-number">3.</span> <span class="nav-text">作者声明</span></a></li>
                  </ol>
                </li>
              </ol>
            </div>
          </div>
        </section>
      <!--/noindex-->
      

      

    </div>
  </aside>


        
      </div>
    </main>

    <!-- Site footer: copyright line plus the busuanzi visitor (uv) and page-view (pv) counters. -->
    <footer id="footer" class="footer">
      <div class="footer-inner">
        <div class="copyright">&copy; <span itemprop="copyrightYear">2019</span>
          <span class="with-love" id="animate">
            <i class="fa fa-user" aria-hidden="true"></i>
          </span>
          <span class="author" itemprop="copyrightHolder">白宁超</span>
        </div>

        <!--<div class="powered-by">由 <a class="theme-link" target="_blank" rel="external nofollow" href="https://hexo.io">Hexo</a> 强力驱动 v3.7.1</div> -->

        <!--<span class="post-meta-divider">|</span>-->

        <!--<div class="theme-info">主题 – <a class="theme-link" target="_blank" rel="external nofollow" href="https://theme-next.org">NexT.Gemini</a> v6.4.1</div>-->

        <!-- Counter script, included exactly once and over an explicit https URL.
             The busuanzi_value_* spans below are presumably filled in by it (confirm). -->
        <script async src="https://busuanzi.ibruce.info/busuanzi/2.3/busuanzi.pure.mini.js"></script>

        <div class="busuanzi-count">
          <span class="site-uv" title="总访客量">
            <i class="fa fa-user" aria-hidden="true"></i>
            <span class="busuanzi-value" id="busuanzi_value_site_uv"></span>
          </span>

          <span class="site-pv" title="总访问量">
            <i class="fa fa-eye" aria-hidden="true"></i>
            <span class="busuanzi-value" id="busuanzi_value_site_pv"></span>
          </span>
        </div>
      </div>
    </footer>

    
      <!-- Back-to-top control: a container holding an upward arrow icon. -->
      <div class="back-to-top">
        <i class="fa fa-arrow-up"></i>
        
      </div>
    

    
	
    

    
  </div>

  

<script type="text/javascript">
  // Keep window.Promise only when it is a genuine function; anything else is
  // replaced with null.
  (function (root) {
    var promiseTag = Object.prototype.toString.call(root.Promise);
    if (promiseTag === '[object Function]') {
      return;
    }
    root.Promise = null;
  })(window);
</script>


























  
  
    <!-- Libraries first, then theme scripts. None of them is async or deferred, so they execute in the order listed. -->
    <script type="text/javascript" src="/lib/jquery/index.js?v=2.1.3"></script>
    <script type="text/javascript" src="/lib/velocity/velocity.min.js?v=1.2.1"></script>
    <script type="text/javascript" src="/lib/velocity/velocity.ui.min.js?v=1.2.1"></script>

    <script type="text/javascript" src="/js/src/utils.js?v=6.4.1"></script>
    <script type="text/javascript" src="/js/src/motion.js?v=6.4.1"></script>
    <script type="text/javascript" src="/js/src/affix.js?v=6.4.1"></script>
    <script type="text/javascript" src="/js/src/schemes/pisces.js?v=6.4.1"></script>
    <script type="text/javascript" src="/js/src/scrollspy.js?v=6.4.1"></script>
    <script type="text/javascript" src="/js/src/post-details.js?v=6.4.1"></script>
    <script type="text/javascript" src="/js/src/bootstrap.js?v=6.4.1"></script>



  



  
    <script type="text/javascript">
      // Options read by the comment widget; refer is this post's path.
      window.livereOptions = {
        refer: '2018/12/21/数据预处理之抽取文本信息（2）/'
      };

      // Insert the embed script ahead of the first <script> element, unless a
      // LivereTower function already exists.
      (function (doc, tagName) {
        var firstTag = doc.getElementsByTagName(tagName)[0];
        if (typeof LivereTower === 'function') {
          return;
        }
        var embed = doc.createElement(tagName);
        embed.src = 'https://cdn-city.livere.com/js/embed.dist.js';
        embed.async = true;
        firstTag.parentNode.insertBefore(embed, firstTag);
      })(document, 'script');
    </script>
  










  

  <script type="text/javascript">
    // Search popup state: set to true once the search index has been fetched.
    var isfetched = false;
    // File name of the search index; an empty name falls back to "search.xml".
    var search_path = "search.xml";
    if (search_path.length === 0) {
      search_path = "search.xml";
    }
    // The index is treated as XML unless its name ends in "json" (any case).
    var isXml = !/json$/i.test(search_path);
    // Index URL, rooted at "/".
    var path = "/" + search_path;
    // monitor main search box;

    // Close the search popup: hide it, clear the query box, drop the rendered
    // results, the "no result" placeholder and the overlay, then remove the
    // inline overflow style from <body>.
    var onPopupClose = function (e) {
      var $page = $('body');
      $('.popup').hide();
      $('#local-search-input').val('');
      $('.search-result-list, #no-result, .local-search-pop-overlay').remove();
      $page.css('overflow', '');
    };

    // Show the search popup: append an overlay that closes the popup when
    // clicked, set overflow:hidden on <body>, toggle the popup and focus the
    // query box (with auto-capitalisation and auto-correction switched off).
    function proceedsearch() {
      var $page = $("body");
      $page.append('<div class="search-popup-overlay local-search-pop-overlay"></div>');
      $page.css('overflow', 'hidden');
      $('.search-popup-overlay').click(onPopupClose);
      $('.popup').toggle();
      $('#local-search-input')
        .attr({autocapitalize: "none", autocorrect: "off"})
        .focus();
    }

    // Fetch the search index and wire up live search on the popup input.
    //
    // path       - URL of the index; parsed as XML or JSON according to the
    //              outer isXml flag.
    // search_id  - id of the input element the query is typed into.
    // content_id - id of the element whose innerHTML receives the results.
    //
    // On success the popup is opened through proceedsearch(). On failure the
    // loading overlay is removed again so the page stays usable, and
    // isfetched stays false so a later click retries the download.
    var searchFunc = function(path, search_id, content_id) {
      'use strict';

      // Helpers are declared at function level rather than inside blocks or
      // loops, which strict-mode code should not rely on.

      // Escape text for use as HTML content or inside a quoted attribute.
      function escapeHtml(text) {
        return String(text)
          .replace(/&/g, '&amp;')
          .replace(/</g, '&lt;')
          .replace(/>/g, '&gt;')
          .replace(/"/g, '&quot;')
          .replace(/'/g, '&#39;');
      }

      // Percent-decode a URL; a malformed sequence yields the raw value
      // instead of throwing out of the input handler.
      function decodeUrl(url) {
        try {
          return decodeURIComponent(url);
        } catch (err) {
          return url;
        }
      }

      // List every occurrence of word in text as {position, word}, scanning
      // left to right without overlaps. Both strings are lower-cased first
      // unless caseSensitive is truthy.
      function getIndexByWord(word, text, caseSensitive) {
        var wordLen = word.length;
        if (wordLen === 0) {
          return [];
        }
        var startPosition = 0, position, index = [];
        if (!caseSensitive) {
          text = text.toLowerCase();
          word = word.toLowerCase();
        }
        while ((position = text.indexOf(word, startPosition)) > -1) {
          index.push({position: position, word: word});
          startPosition = position + wordLen;
        }
        return index;
      }

      // Take hits off the end of index (callers sort it so the earliest hit
      // is last) for as long as they end at or before `end`, discarding hits
      // that overlap one already taken. The returned slice records the hits,
      // its bounds, and how many hits equal the complete query text.
      function mergeIntoSlice(start, end, index, searchText) {
        var item = index[index.length - 1];
        var position = item.position;
        var word = item.word;
        var hits = [];
        var searchTextCountInSlice = 0;
        while (position + word.length <= end && index.length != 0) {
          if (word === searchText) {
            searchTextCountInSlice++;
          }
          hits.push({position: position, length: word.length});
          var wordEnd = position + word.length;

          // move to next position of hit
          index.pop();
          while (index.length != 0) {
            item = index[index.length - 1];
            position = item.position;
            word = item.word;
            if (wordEnd > position) {
              index.pop();
            } else {
              break;
            }
          }
        }
        return {
          hits: hits,
          start: start,
          end: end,
          searchTextCount: searchTextCountInSlice
        };
      }

      // Render text[slice.start, slice.end) with every hit wrapped in
      // <b class="search-keyword">. When `escape` is given, each text segment
      // is passed through it before being concatenated.
      function highlightKeyword(text, slice, escape) {
        var render = escape || function (segment) { return segment; };
        var result = '';
        var prevEnd = slice.start;
        slice.hits.forEach(function (hit) {
          result += render(text.substring(prevEnd, hit.position));
          var end = hit.position + hit.length;
          result += '<b class="search-keyword">' + render(text.substring(hit.position, end)) + '</b>';
          prevEnd = end;
        });
        result += render(text.substring(prevEnd, slice.end));
        return result;
      }

      // Remove the loading overlay and the inline overflow style on <body>.
      function removeLoadingOverlay() {
        $(".local-search-pop-overlay").remove();
        $('body').css('overflow', '');
      }

      // start loading animation
      $("body")
        .append('<div class="search-popup-overlay local-search-pop-overlay">' +
          '<div id="search-loading-icon">' +
          '<i class="fa fa-spinner fa-pulse fa-5x fa-fw"></i>' +
          '</div>' +
          '</div>')
        .css('overflow', 'hidden');
      $("#search-loading-icon").css('margin', '20% auto 0 auto').css('text-align', 'center');

      $.ajax({
        url: path,
        dataType: isXml ? "xml" : "json",
        async: true,
        success: function(res) {
          // get the contents from search data
          isfetched = true;
          $('.popup').detach().appendTo('.header-inner');
          var datas = isXml ? $("entry", res).map(function() {
            return {
              title: $("title", this).text(),
              content: $("content",this).text(),
              url: $("url" , this).text()
            };
          }).get() : res;
          var input = document.getElementById(search_id);
          var resultContent = document.getElementById(content_id);

          // Re-run the search for the current input value and render the result list.
          var inputEventFunction = function() {
            var searchText = input.value.trim().toLowerCase();
            var keywords = searchText.split(/[\s\-]+/);
            if (keywords.length > 1) {
              // The complete query also counts as a keyword of its own.
              keywords.push(searchText);
            }
            var resultItems = [];
            if (searchText.length > 0) {
              // perform local searching
              datas.forEach(function(data) {
                var searchTextCount = 0;
                var title = data.title.trim();
                var titleInLowerCase = title.toLowerCase();
                // Tags are stripped from the content before matching.
                var content = data.content.trim().replace(/<[^>]+>/g,"");
                var contentInLowerCase = content.toLowerCase();
                // Escaped because it is placed inside a quoted href attribute.
                var articleUrl = escapeHtml(decodeUrl(data.url));
                var indexOfTitle = [];
                var indexOfContent = [];

                // only match articles with not empty titles
                if (title != '') {
                  keywords.forEach(function(keyword) {
                    indexOfTitle = indexOfTitle.concat(getIndexByWord(keyword, titleInLowerCase, false));
                    indexOfContent = indexOfContent.concat(getIndexByWord(keyword, contentInLowerCase, false));
                  });
                }

                // Counted before the indexes are consumed below.
                var hitCount = indexOfTitle.length + indexOfContent.length;
                if (hitCount === 0) {
                  return;
                }

                // sort index by position of keyword (descending, so pop()
                // yields the earliest hit; for equal positions the longer
                // word is popped first)
                [indexOfTitle, indexOfContent].forEach(function (index) {
                  index.sort(function (itemLeft, itemRight) {
                    if (itemRight.position !== itemLeft.position) {
                      return itemRight.position - itemLeft.position;
                    } else {
                      return itemLeft.word.length - itemRight.word.length;
                    }
                  });
                });

                // merge hits into slices
                var slicesOfTitle = [];
                if (indexOfTitle.length != 0) {
                  var titleSlice = mergeIntoSlice(0, title.length, indexOfTitle, searchText);
                  searchTextCount += titleSlice.searchTextCount;
                  slicesOfTitle.push(titleSlice);
                }

                var slicesOfContent = [];
                while (indexOfContent.length != 0) {
                  var item = indexOfContent[indexOfContent.length - 1];
                  var position = item.position;
                  var word = item.word;
                  // cut out 100 characters
                  var start = position - 20;
                  var end = position + 80;
                  if (start < 0) {
                    start = 0;
                  }
                  if (end < position + word.length) {
                    end = position + word.length;
                  }
                  if (end > content.length) {
                    end = content.length;
                  }
                  var contentSlice = mergeIntoSlice(start, end, indexOfContent, searchText);
                  if (contentSlice.hits.length === 0) {
                    // Nothing fitted inside the slice, so nothing was
                    // consumed; drop the hit to guarantee the loop ends.
                    indexOfContent.pop();
                    continue;
                  }
                  searchTextCount += contentSlice.searchTextCount;
                  slicesOfContent.push(contentSlice);
                }

                // sort slices in content by search text's count and hits' count
                slicesOfContent.sort(function (sliceLeft, sliceRight) {
                  if (sliceLeft.searchTextCount !== sliceRight.searchTextCount) {
                    return sliceRight.searchTextCount - sliceLeft.searchTextCount;
                  } else if (sliceLeft.hits.length !== sliceRight.hits.length) {
                    return sliceRight.hits.length - sliceLeft.hits.length;
                  } else {
                    return sliceLeft.start - sliceRight.start;
                  }
                });

                // select top N slices in content
                var upperBound = parseInt('1', 10);
                if (upperBound >= 0) {
                  slicesOfContent = slicesOfContent.slice(0, upperBound);
                }

                // highlight title and content; the title is plain text and is
                // escaped, the tag-stripped content is inserted as before
                var resultItem = '';

                if (slicesOfTitle.length != 0) {
                  resultItem += "<li><a href='" + articleUrl + "' class='search-result-title'>" + highlightKeyword(title, slicesOfTitle[0], escapeHtml) + "</a>";
                } else {
                  resultItem += "<li><a href='" + articleUrl + "' class='search-result-title'>" + escapeHtml(title) + "</a>";
                }

                slicesOfContent.forEach(function (slice) {
                  resultItem += "<a href='" + articleUrl + "'>" +
                    "<p class=\"search-result\">" + highlightKeyword(content, slice) +
                    "...</p>" + "</a>";
                });

                resultItem += "</li>";
                resultItems.push({
                  item: resultItem,
                  searchTextCount: searchTextCount,
                  hitCount: hitCount,
                  id: resultItems.length
                });
              });
            }
            if (keywords.length === 1 && keywords[0] === "") {
              resultContent.innerHTML = '<div id="no-result"><i class="fa fa-search fa-5x"></i></div>';
            } else if (resultItems.length === 0) {
              resultContent.innerHTML = '<div id="no-result"><i class="fa fa-frown-o fa-5x"></i></div>';
            } else {
              // Most full-query hits first, then most hits overall, then the
              // entry that was added later.
              resultItems.sort(function (resultLeft, resultRight) {
                if (resultLeft.searchTextCount !== resultRight.searchTextCount) {
                  return resultRight.searchTextCount - resultLeft.searchTextCount;
                } else if (resultLeft.hitCount !== resultRight.hitCount) {
                  return resultRight.hitCount - resultLeft.hitCount;
                } else {
                  return resultRight.id - resultLeft.id;
                }
              });
              var searchResultList = '<ul class=\"search-result-list\">';
              resultItems.forEach(function (result) {
                searchResultList += result.item;
              });
              searchResultList += "</ul>";
              resultContent.innerHTML = searchResultList;
            }
          };

          if ('auto' === 'auto') {
            input.addEventListener('input', inputEventFunction);
          } else {
            $('.search-icon').click(inputEventFunction);
            input.addEventListener('keypress', function (event) {
              if (event.keyCode === 13) {
                inputEventFunction();
              }
            });
          }

          // remove loading animation
          removeLoadingOverlay();

          proceedsearch();
        },
        error: function() {
          // Without this the overlay and overflow:hidden would stay in place
          // after a failed download.
          removeLoadingOverlay();
        }
      });
    };

    // Open the search popup from its trigger; the index is fetched only while
    // isfetched is still false.
    $('.popup-trigger').click(function (e) {
      e.stopPropagation();
      if (isfetched !== false) {
        proceedsearch();
        return;
      }
      searchFunc(path, 'local-search-input', 'local-search-result');
    });

    // The close button dismisses the popup; clicks inside the popup do not
    // propagate further.
    $('.popup-btn-close').click(onPopupClose);
    $('.popup').click(function (e) {
      e.stopPropagation();
    });

    // Escape (key code 27) dismisses the popup while it is visible.
    $(document).on('keyup', function (event) {
      if (event.which !== 27) {
        return;
      }
      if (!$('.search-popup').is(':visible')) {
        return;
      }
      onPopupClose();
    });
  </script>





  

  

  
<script>
// Insert Baidu's link-submission push.js ahead of the first <script> element,
// choosing the script URL that matches the protocol the page was loaded over.
(function () {
    var isHttps = window.location.protocol.split(':')[0] === 'https';
    var pushScript = document.createElement('script');
    pushScript.src = isHttps
        ? 'https://zz.bdstatic.com/linksubmit/push.js'
        : 'http://push.zhanzhang.baidu.com/push.js';
    var firstScript = document.getElementsByTagName("script")[0];
    firstScript.parentNode.insertBefore(pushScript, firstScript);
})();
</script>


  
  

  
  

  
    
      <!-- MathJax setup. Inline math is delimited by $...$ or \(...\), backslash escapes are processed,
           the listed tags are skipped, and equations are numbered with the "AMS" setting. -->
      <script type="text/x-mathjax-config">
        MathJax.Hub.Config({
          tex2jax: {
            inlineMath: [ ['$','$'], ["\\(","\\)"]  ],
            processEscapes: true,
            skipTags: ['script', 'noscript', 'style', 'textarea', 'pre', 'code']
          },
          TeX: {equationNumbers: { autoNumber: "AMS" }}
        });
      </script>

      <!-- Once queued work runs, tag the parent of every typeset formula with the has-jax class. -->
      <script type="text/x-mathjax-config">
        MathJax.Hub.Queue(function() {
          var all = MathJax.Hub.getAllJax(), i;
          for (i = 0; i < all.length; i += 1) {
            all[i].SourceElement().parentNode.className += ' has-jax';
          }
        });
      </script>

      <!-- Explicit https URL: a protocol-relative one cannot load when the page is opened from file://
           and would be fetched insecurely on an http page. -->
      <script type="text/javascript" src="https://cdn.jsdelivr.net/npm/mathjax@2.7.1/MathJax.js?config=TeX-AMS-MML_HTMLorMML"></script>

    
  


  
  
  
  <script src="/lib/needsharebutton/needsharebutton.js"></script>

  <script>
    // Options for the share button attached to #needsharebutton-postbottom.
    // Declared with var: the bare assignment created an implicit global, which
    // throws in strict mode. A top-level var is still reachable as
    // window.pbOptions, so any other reader of the name keeps working.
    var pbOptions = {
      iconStyle: "box",
      boxForm: "horizontal",
      position: "bottomCenter",
      networks: "Weibo,Wechat,Douban,QQZone,Linkedin,Facebook"
    };

    new needShareButton('#needsharebutton-postbottom', pbOptions);
  </script>

  

  

  

  

  

  

  <!-- Little red heart effect on page clicks. Root-relative path, like the other theme scripts: a "../" path resolves against the post's own directory. -->
	<script type="text/javascript" src="/js/src/love.js"></script><!-- hexo-inject:begin --><!-- Begin: Injected MathJax -->
<!-- MathJax is configured and loaded once, earlier in this page. The block injected here repeated the same Hub.Config options and the same has-jax queue hook, and loaded MathJax 2.7.1 a second time without a config file, so it is not emitted again. -->
<!-- End: Injected MathJax -->
<!-- hexo-inject:end -->
</body>
</html>
